{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "Imputing missing values through various strategies"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from sklearn import datasets\n",
    "import numpy as np\n",
    "\n",
    "iris = datasets.load_iris()\n",
    "iris_X = iris.data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "masking_array = np.random.binomial(1, .25,iris_X.shape).astype(bool)\n",
    "iris_X[masking_array] = np.nan"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[ True, False, False, False],\n",
       "       [ True, False, False,  True],\n",
       "       [False, False,  True,  True],\n",
       "       [ True, False,  True, False],\n",
       "       [ True, False,  True, False]], dtype=bool)"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "masking_array[:5]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[ nan,  3.5,  1.4,  0.2],\n",
       "       [ nan,  3. ,  1.4,  nan],\n",
       "       [ 4.7,  3.2,  nan,  nan],\n",
       "       [ nan,  3.1,  nan,  0.2],\n",
       "       [ nan,  3.6,  nan,  0.2]])"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "iris_X [:5]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[ 5.86371681,  3.5       ,  1.4       ,  0.2       ],\n",
       "       [ 5.86371681,  3.        ,  1.4       ,  1.2619469 ],\n",
       "       [ 4.7       ,  3.2       ,  3.78918919,  1.2619469 ],\n",
       "       [ 5.86371681,  3.1       ,  3.78918919,  0.2       ],\n",
       "       [ 5.86371681,  3.6       ,  3.78918919,  0.2       ]])"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn import preprocessing\n",
    "impute = preprocessing.Imputer()\n",
    "iris_X_prime = impute.fit_transform(iris_X)\n",
    "iris_X_prime[:5]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "5.8637168141592939"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "iris_X_prime[0, 0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "nan"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "iris_X[0, 0] "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[ 5.8,  3.5,  1.4,  0.2],\n",
       "       [ 5.8,  3. ,  1.4,  1.4],\n",
       "       [ 4.7,  3.2,  4.2,  1.4],\n",
       "       [ 5.8,  3.1,  4.2,  0.2],\n",
       "       [ 5.8,  3.6,  4.2,  0.2]])"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "impute = preprocessing.Imputer(strategy='median')\n",
    "iris_X_prime = impute.fit_transform(iris_X)\n",
    "iris_X_prime[:5]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[-1. ,  3.5,  1.4,  0.2],\n",
       "       [-1. ,  3. ,  1.4, -1. ],\n",
       "       [ 4.7,  3.2, -1. , -1. ],\n",
       "       [-1. ,  3.1, -1. ,  0.2],\n",
       "       [-1. ,  3.6, -1. ,  0.2]])"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "iris_X[np.isnan(iris_X)] = -1\n",
    "iris_X[:5]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[ 5.86371681,  3.5       ,  1.4       ,  0.2       ],\n",
       "       [ 5.86371681,  3.        ,  1.4       ,  1.2619469 ],\n",
       "       [ 4.7       ,  3.2       ,  3.78918919,  1.2619469 ],\n",
       "       [ 5.86371681,  3.1       ,  3.78918919,  0.2       ],\n",
       "       [ 5.86371681,  3.6       ,  3.78918919,  0.2       ]])"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "impute = preprocessing.Imputer(missing_values=-1)\n",
    "iris_X_prime = impute.fit_transform(iris_X)\n",
    "iris_X_prime[:5]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[-1. ,  3.5,  1.4,  0.2],\n",
       "       [-1. ,  3. ,  1.4, -1. ],\n",
       "       [ 4.7,  3.2, -1. , -1. ],\n",
       "       [-1. ,  3.1, -1. ,  0.2],\n",
       "       [-1. ,  3.6, -1. ,  0.2]])"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "iris_X_prime = np.where(pd.DataFrame(iris_X).isnull(),-1,iris_X)\n",
    "iris_X_prime[:5]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[-1. ,  3.5,  1.4,  0.2],\n",
       "       [-1. ,  3. ,  1.4, -1. ],\n",
       "       [ 4.7,  3.2, -1. , -1. ],\n",
       "       [-1. ,  3.1, -1. ,  0.2],\n",
       "       [-1. ,  3.6, -1. ,  0.2]])"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.DataFrame(iris_X).fillna(-1)[:5].values"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
